The (interactive) correlation heatmap reveals very high correlation among TG compounds.
Performing a Global Test on the serum metabolites of AD patients, correcting for sex (H0: E4 status has no effect on metabolite levels; Ha: it has an effect), yields a significant difference (p = 0.046) between E4 carriers and non-carriers. Testing whether the count of E4 alleles has an effect on metabolite levels showed no significant effect.
# Load the serum metabolite dataset; assumed to provide a data.frame `df`
# with metabolite columns plus Diagnosis, sex, APOE and E4 columns — confirm.
load("data.Rdata")
# Restrict the analysis to probable Alzheimer's disease patients.
gdf <- subset(df, Diagnosis == "Probable AD")
# Recode sex from factor codes to a numeric 0-based covariate
# (presumably two levels -> 0/1; verify level order).
gdf$sex <- as.numeric(gdf$sex) -1
# Binary outcome
# Global Test of H0 "E4 carrier status has no association with the
# metabolites": null model E4 ~ 1 vs. all columns except APOE and Diagnosis.
gt.b <- globaltest::gt(E4 ~ 1, E4 ~ . - APOE - Diagnosis, data = gdf)
gt.b
# Multinomial outcome
# Same test with the full APOE genotype as a multi-level outcome.
gdf$APOE <- as.factor(gdf$APOE)
gt.m <- globaltest::gt(APOE ~ 1, APOE ~ . - E4 -Diagnosis, data = gdf)
# NOTE(review): extraction artifact — the narrative sentence below is fused
# onto the result-echo line `gt.m`; it is not valid R code.
gt.mTriglycerides and diglycerides seem to survive FDR control.
# Load the AD metabolomics objects; assumed to provide `ADmets`
# (samples x metabolites) and `geno` (per-sample APOE info) — confirm.
load("ADdata.Rdata")
# Store all observations of Class 1 in C1
# Class 1: E4 non-carriers; columns 1:230 are the metabolite measurements.
C1 <- ADmets[geno$APOEb == "E4NO", 1:230]
# Store all observations of Class 2 in C2
# Class 2: E4 carriers.
C2 <- ADmets[geno$APOEb == "E4YES", 1:230]
# Create a function to perform the Wilcoxon rank-sum (Mann-Whitney U) test on two vectors
# Wilcoxon rank-sum (Mann-Whitney U) test for two independent samples.
# Returns the one-sided p-value for the alternative that `x` is
# stochastically less than `y` (unpaired test).
MannWhitneyU <- function(x, y) {
  wilcox.test(x, y, paired = FALSE, alternative = "less")$p.value
}
# Use purrr::map2 to apply the function to corresponding columns
# (one test per metabolite; result is a named list of p-values).
wilcoxons <- purrr::map2(C1[, 1:230], C2[, 1:230], ~ MannWhitneyU(.x, .y))
# Coerce p_values to dataframe and transpose it
p_values <- t(data.frame(wilcoxons))
# Calculate the FDR-adjusted p-values
# NOTE(review): `wilcoxons` is a list; p.adjust() expects a numeric vector.
# It may coerce silently here, but unlist(wilcoxons) would be safer — verify.
p_adj <- p.adjust(wilcoxons, method = "fdr")
results <- data.frame(p_values, p_adj)
# Filter out the non-significant (a=0.05) FDR-adjusted p-values
dplyr::filter(results, p_adj < 0.05)Carriers of one copy of E4 vs two copies tend to have less histamine, fumaric acid, uracil, triglyceride TG.48.0, and phosphatidylcholine PC.36.4 (raw p < 0.05); however, these do not survive the FDR cutoff at a = 0.05 (p_adj = 0.97).
# Collapse APOE genotypes into E4 dosage groups: one copy vs. two copies.
geno$g <- geno$APOE
# E3E4 and E2E4 both carry a single E4 allele (chained assignment
# evaluates right-to-left, recoding both to "E4x1").
geno$g[geno$g == "E3E4"] <- geno$g[geno$g == "E2E4"] <- "E4x1"
# E4E4 homozygotes carry two copies.
geno$g[geno$g == "E4E4"] <- "E4x2"
# Store all observations of Class 1 in C1
C1 <- ADmets[geno$g == "E4x1", 1:230]
# Store all observations of Class 2 in C2
C2 <- ADmets[geno$g == "E4x2", 1:230]
# Use purrr::map2 to apply the function to corresponding columns
wilcoxons <- purrr::map2(C1[, 1:230], C2[, 1:230], ~ MannWhitneyU(.x, .y))
# Coerce p_values to dataframe and transpose it
p_values <- t(data.frame(wilcoxons))
# Calculate the FDR-adjusted p-values
p_adj <- p.adjust(wilcoxons, method = "fdr")
results <- data.frame(p_values, p_adj)
# Filter out the non-significant (a=0.05) FDR-adjusted p-values
# NOTE(review): extraction artifact — a narrative sentence is fused onto
# the filter call below.
dplyr::filter(results, p_adj < 0.05)Testing no E4 vs one E4, no metabolites differ significantly.
# Re-collapse genotypes: single E4 copy vs. no E4 at all.
geno$g <- geno$APOE
geno$g[geno$g == "E3E4"] <- geno$g[geno$g == "E2E4"] <- "E4x1"
# All non-carriers (via the binary APOEb flag) form the reference group.
geno$g[geno$APOEb == "E4NO"] <- "E4x0"
# Store all observations of Class 1 in C1
C1 <- ADmets[geno$g == "E4x1", 1:230]
# Store all observations of Class 2 in C2
C2 <- ADmets[geno$g == "E4x0", 1:230]
# Use purrr::map2 to apply the function to corresponding columns
wilcoxons <- purrr::map2(C1[, 1:230], C2[, 1:230], ~ MannWhitneyU(.x, .y))
# Coerce p_values to dataframe and transpose it
p_values <- t(data.frame(wilcoxons))
# Calculate the FDR-adjusted p-values
p_adj <- p.adjust(wilcoxons, method = "fdr")
results <- data.frame(p_values, p_adj)
# Filter out the non-significant (a=0.05) FDR-adjusted p-values
# NOTE(review): extraction artifact — the dplyr::filter() call fused onto the
# next line belongs to the previous analysis chunk, not to fit().
#
# fit(): train a caret classifier on predictors X and outcome y, then collect
# resampled performance (confusion matrix, per-class metrics, ROC/AUC).
#   title - column label for the returned one-column metrics table
#   X     - data.frame of predictors
#   y     - binary outcome; levels X0/X1 assumed by the X1-probability and
#           `positive = "X1"` lines below — confirm with callers
#   model - caret method string (e.g. "glmnet", "rpart2", "xgbTree")
#   ctrl  - caret::trainControl object; grid - tuning grid
#   seed  - RNG seed for reproducible resampling; ... forwarded to train()
# Returns a list with elements "metrics", "roc" and "model".
dplyr::filter(results, p_adj < 0.05)fit <- function(title,
X,
y,
model,
ctrl = NULL,
grid = NULL,
seed = 123, ...) {
set.seed(seed)
# Merge X and y into df
# NOTE(review): merge.data.frame(X, y) with no shared column names performs
# a Cartesian cross join — presumably data.frame(X, y = y) was intended;
# verify the resulting row count before trusting downstream metrics.
df <- merge.data.frame(X, y)
# Train the model
mdl <- caret::train(df[, 1:ncol(X)], df$y,
method = model,
tuneGrid = grid,
trControl = ctrl,
metric = "ROC",
preProcess = c("center", "scale"),
...
)
# Create a confusion matrix and get performance metrics from caret
# mdl$pred holds the held-out predictions saved by savePredictions = TRUE.
obs <- mdl$pred$obs
preds <- mdl$pred$pred
# NOTE(review): caret::confusionMatrix(data, reference) — the observed labels
# are passed as `data` and the predictions as `reference` here, which swaps
# sensitivity/PPV with their converses; confirm this order is intentional.
cm <- confusionMatrix(obs, preds,
dnn = c("X0", "X1"), # nolint
positive = "X1")
# Predictions
# Recode the observed factor to 0/1 and take the predicted X1 probability.
ys <- as.numeric(obs) -1
yhats <- mdl$pred$X1
# ROC curve with a bootstrap CI (pROC assumed attached by the caller).
roc <- roc(ys, yhats,
levels = c(0, 1),
ci = TRUE, boot.n = 1000, ci.alpha = 0.95)
# One-column data.frame: caret per-class metrics plus the AUC.
metrics <- data.frame(c(cm$byClass, roc$auc),
row.names = c(names(cm$byClass), "AUC")
)
names(metrics) <- title
out <- list("metrics" = metrics, "roc" = roc, "model" = mdl)
return(out)
}load("./data.Rdata")
# Build the modelling dataset: metabolite predictors and the E4 outcome for
# the probable-AD patients.
X <- df[df$Diagnosis == "Probable AD", 1:230]
y <- df[df$Diagnosis == "Probable AD", "E4"]
y <- as.factor(y)
# make.names() turns the levels into syntactic names (e.g. 0/1 -> X0/X1),
# which caret requires when classProbs = TRUE.
levels(y) <- make.names(levels(y))
# Define the model training parameters, repeated 10-fold cross-validation
# NOTE(review): the trailing comma after `sampling = 'smote'` leaves an empty
# argument in the call — R normally errors on this; remove the comma.
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 1,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = TRUE,
sampling = 'smote',
# NOTE(review): extraction artifact — the lr <- fit(...) call and a console
# error ("package MLmetrics is required") are fused onto the lines below.
)lr <- fit(
title = "Logistic Regression",
X = X,
y = y,
model = "glmnet",
ctrl = ctrl,
grid = expand.grid(alpha = c(0), lambda = c(100)))Error: package MLmetrics is required
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 37 35 48
X1 56 58 69
X2 18 27 33
Overall Statistics
Accuracy : 0.336
95% CI : (0.2887, 0.3858)
No Information Rate : 0.3937
P-Value [Acc > NIR] : 0.9913
Kappa : 0.0182
Mcnemar's Test P-Value : 4.932e-08
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.33333 0.4833 0.22000
Specificity 0.69259 0.5211 0.80519
Pos Pred Value 0.30833 0.3169 0.42308
Neg Pred Value 0.71648 0.6869 0.61386
Prevalence 0.29134 0.3150 0.39370
Detection Rate 0.09711 0.1522 0.08661
Detection Prevalence 0.31496 0.4803 0.20472
Balanced Accuracy 0.51296 0.5022 0.51260
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 17 14 9
X1 12 36 13
X2 9 9 8
Overall Statistics
Accuracy : 0.4803
95% CI : (0.3909, 0.5707)
No Information Rate : 0.4646
P-Value [Acc > NIR] : 0.3941
Kappa : 0.1806
Mcnemar's Test P-Value : 0.8300
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.4474 0.6102 0.26667
Specificity 0.7416 0.6324 0.81443
Pos Pred Value 0.4250 0.5902 0.30769
Neg Pred Value 0.7586 0.6515 0.78218
Prevalence 0.2992 0.4646 0.23622
Detection Rate 0.1339 0.2835 0.06299
Detection Prevalence 0.3150 0.4803 0.20472
Balanced Accuracy 0.5945 0.6213 0.54055
#Get a performance metrics table
# NOTE(review): `mlr`, `mtree` and `mrf` are not defined in the code shown
# here, and fit() returns "metrics"/"roc"/"model" — not "cm". Presumably
# these come from a multifit() helper defined elsewhere; row 3 of byClass
# is assumed to be the third class (X2). Confirm both.
multimetrics <- cbind(mlr$cm$byClass[3,], mtree$cm$byClass[3,], mrf$cm$byClass[3,])
multimetrics [,1] [,2] [,3]
Sensitivity 0.26804124 0.22000000 0.26666667
Specificity 0.81690141 0.80519481 0.81443299
Pos Pred Value 0.33333333 0.42307692 0.30769231
Neg Pred Value 0.76567657 0.61386139 0.78217822
Precision 0.33333333 0.42307692 0.30769231
Recall 0.26804124 0.22000000 0.26666667
F1 0.29714286 0.28947368 0.28571429
Prevalence 0.25459318 0.39370079 0.23622047
Detection Rate 0.06824147 0.08661417 0.06299213
Detection Prevalence 0.20472441 0.20472441 0.20472441
Balanced Accuracy 0.54247132 0.51259740 0.54054983
#Plot ROC curves
mrocs <- list(mlr$roc, mtree$roc, mrf$roc)
# Generate labels
# NOTE(review): `metrics` is not defined at this point — probably
# `multimetrics` was intended; row 12 is assumed to hold the AUC.
labels <- paste0(names(metrics), ", AUC = ", paste(round(metrics[12, ], 2)))
# httpgd::hgd()
# The fused console error below shows this call failed: the list elements
# were not pROC "roc" objects (likely NULL `$roc` components).
ggroc(mrocs) +
theme_clean() +
scale_color_tableau(labels = labels)Error in ggroc.list(mrocs) :
All elements in 'data' must be 'roc' objects.
# Second feature set: Thomson metabolites as predictors, same binary E4
# outcome from the probable-AD subset.
load("thomson.Rdata")
X <- thomson
y <- df[df$Diagnosis == "Probable AD", "E4"]
y <- as.factor(y)
# NOTE(review): extraction artifact — the ctrl$summaryFunction assignment is
# fused onto the end of the levels() line below.
levels(y) <- make.names(levels(y))ctrl$summaryFunction <- twoClassSummary
# Plain logistic regression (glm) on the Thomson features.
# NOTE(review): trailing comma before the closing parenthesis (empty
# argument); a fused pROC console message follows the call.
lrf <- fit(title = "Logistic Regression",
X = X,
y = y,
model = "glm",
ctrl = ctrl,
)Setting direction: controls > cases
# Shallow decision tree (rpart2, maximum depth 2).
treef <- fit(
title = "Decision Tree",
X = X,
y = y,
model = "rpart2",
ctrl = ctrl,
grid = expand.grid(maxdepth = c(2))
)Setting direction: controls < cases
# Single-configuration XGBoost tuning grid.
param <- data.frame(nrounds = c(10), max_depth = c(2), eta = c(0.3), gamma = c(0), colsample_bytree = 1, min_child_weight = c(1), subsample = c(1))
# NOTE(review): the variable is named `rff` but the title/method are XGBoost —
# the name suggests this slot once held a random forest; consider renaming.
rff <- fit(title = "XGBoost",
X = X,
y = y,
model = "xgbTree",
ctrl = ctrl,
grid = param
)Setting direction: controls > cases
#Get a performance metrics table
# Column-bind the one-column metric tables returned by fit().
metricsf <- cbind(lrf$metrics, treef$metrics, rff$metrics)
# NOTE(review): extraction artifact — the `metricsf` echo and the plot-section
# comment are fused on one line below.
metricsf#Plot ROC curves
rocsf <- list(lrf$roc, treef$roc, rff$roc)
# Generate labels
# NOTE(review): probably should reference `metricsf`, not `metrics`;
# row 12 is assumed to be the AUC row.
labels <- paste0(names(metrics), ", AUC = ", paste(round(metrics[12, ], 2)))
# httpgd::hgd()
# NOTE(review): the multiClassSummary assignment fused below belongs to the
# next (multinomial) section.
ggroc(rocsf) +
theme_clean() +
scale_color_tableau(labels = labels)ctrl$summaryFunction <- multiClassSummary
# Multinomial setting: predict the full APOE genotype (three classes) from
# the metabolites of the probable-AD patients.
X <- df[df$Diagnosis == "Probable AD", 1:230]
y <- df[df$Diagnosis == "Probable AD", "APOE"]
y <- as.factor(y)
levels(y) <- make.names(levels(y))
# NOTE(review): multifit() is not defined in the code shown — presumably a
# multi-class variant of fit(); confirm it exists elsewhere in the project.
mlrf <- multifit(
X = X,
y = y,
model = "multinom",
ctrl = ctrl,
trace = FALSE,
tuneLength = 3
)Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 59 43 18
X1 50 80 53
X2 12 46 20
Overall Statistics
Accuracy : 0.4173
95% CI : (0.3673, 0.4686)
No Information Rate : 0.4436
P-Value [Acc > NIR] : 0.8606
Kappa : 0.0867
Mcnemar's Test P-Value : 0.5277
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.4876 0.4734 0.21978
Specificity 0.7654 0.5142 0.80000
Pos Pred Value 0.4917 0.4372 0.25641
Neg Pred Value 0.7625 0.5505 0.76568
Prevalence 0.3176 0.4436 0.23885
Detection Rate 0.1549 0.2100 0.05249
Detection Prevalence 0.3150 0.4803 0.20472
Balanced Accuracy 0.6265 0.4938 0.50989
Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 42 29 49
X1 57 56 70
X2 19 29 30
Overall Statistics
Accuracy : 0.336
95% CI : (0.2887, 0.3858)
No Information Rate : 0.3911
P-Value [Acc > NIR] : 0.9886
Kappa : 0.0216
Mcnemar's Test P-Value : 1.477e-08
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.3559 0.4912 0.20134
Specificity 0.7034 0.5243 0.79310
Pos Pred Value 0.3500 0.3060 0.38462
Neg Pred Value 0.7088 0.7071 0.60726
Prevalence 0.3097 0.2992 0.39108
Detection Rate 0.1102 0.1470 0.07874
Detection Prevalence 0.3150 0.4803 0.20472
Balanced Accuracy 0.5297 0.5078 0.49722
Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 17 14 9
X1 12 36 13
X2 9 9 8
Overall Statistics
Accuracy : 0.4803
95% CI : (0.3909, 0.5707)
No Information Rate : 0.4646
P-Value [Acc > NIR] : 0.3941
Kappa : 0.1806
Mcnemar's Test P-Value : 0.8300
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.4474 0.6102 0.26667
Specificity 0.7416 0.6324 0.81443
Pos Pred Value 0.4250 0.5902 0.30769
Neg Pred Value 0.7586 0.6515 0.78218
Prevalence 0.2992 0.4646 0.23622
Detection Rate 0.1339 0.2835 0.06299
Detection Prevalence 0.3150 0.4803 0.20472
Balanced Accuracy 0.5945 0.6213 0.54055
#Get a performance metrics table
# NOTE(review): `mtreef` and `mrff` are not defined in the code shown here;
# row 3 of byClass presumably corresponds to the third class (X2) — confirm.
multimetricsf <- cbind(mlrf$cm$byClass[3,], mtreef$cm$byClass[3,], mrff$cm$byClass[3,])
multimetricsf [,1] [,2] [,3]
Sensitivity 0.21978022 0.20134228 0.26666667
Specificity 0.80000000 0.79310345 0.81443299
Pos Pred Value 0.25641026 0.38461538 0.30769231
Neg Pred Value 0.76567657 0.60726073 0.78217822
Precision 0.25641026 0.38461538 0.30769231
Recall 0.21978022 0.20134228 0.26666667
F1 0.23668639 0.26431718 0.28571429
Prevalence 0.23884514 0.39107612 0.23622047
Detection Rate 0.05249344 0.07874016 0.06299213
Detection Prevalence 0.20472441 0.20472441 0.20472441
Balanced Accuracy 0.50989011 0.49722287 0.54054983
#Plot ROC curves
mrocsf <- list(mlrf$roc, mtreef$roc, mrff$roc)
# Generate labels
# NOTE(review): `metrics` likely should be `multimetricsf`; row 12 assumed
# to be the AUC row.
labels <- paste0(names(metrics), ", AUC = ", paste(round(metrics[12, ], 2)))
# httpgd::hgd()
# The fused console error below shows this call failed: the list elements
# were not pROC "roc" objects.
ggroc(mrocsf) +
theme_clean() +
scale_color_tableau(labels = labels)Error in ggroc.list(mrocsf) :
All elements in 'data' must be 'roc' objects.